suppressPackageStartupMessages(library(tidyverse))
devtools::load_all(
'~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
Settings
# Project root used to build the paths below.
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
# Switch the working directory to the project root so that relative paths
# used later in this script resolve against it.
setwd(wd)

# Output folders handed to the figure and table export helpers.
figdir <- paste0(wd, 'Figures/DRS_diffthresh/')
tabledir <- paste0(wd, 'Tables/DRS_diffthresh/')

# Default ggplot2 theme: classic theme with a base size of 7 and the legend
# placed at the bottom.
theme_set(
  theme_classic(base_size = 7) + theme(legend.position = 'bottom')
)
Functions
# Select rows whose intensity increases in both the `_G` and `_I` column sets.
#
# A row is kept when, for both column sets, the KS intensity p-value is below
# `pval` and the c2 median intensity exceeds the c1 median intensity by more
# than `diff_med`. Rows where any of these comparisons is NA are dropped,
# because `filter()` only keeps rows whose conditions evaluate to TRUE.
#
# Args:
#   pval:     exclusive upper bound applied to both KS intensity p-values.
#   diff_med: exclusive lower bound applied to both (c2 - c1) median
#             intensity differences.
#   data:     table to filter. Defaults to the global
#             `sampcomp_results_joined`; the default is evaluated lazily, so
#             the table only has to exist when the function is called.
#
# Returns: the rows of `data` that satisfy all four conditions.
filter_KS_intensity_increase_pval_diff_median <-
  function(pval, diff_med, data = sampcomp_results_joined) {
    data |>
      filter(
        # Comma-separated conditions are combined with a logical AND.
        KS_intensity_pvalue_G < pval,
        KS_intensity_pvalue_I < pval,
        c2_median_intensity_G - c1_median_intensity_G > diff_med,
        c2_median_intensity_I - c1_median_intensity_I > diff_med
      )
  }
# Tabulate the middle base of `ref_kmer` and its share of all rows.
#
# Args:
#   df: table with a character column `ref_kmer`; the third character of
#       each value is used as the middle base.
#
# Returns: one row per observed middle base with columns `mid_base`, `n`
#   (row count) and `percent` (`n` as a percentage of the total row count).
#   Despite the function name, every observed base is reported, not only C.
calc_midC_percentage <- function(df) {
  base_counts <- df |>
    group_by(mid_base = str_sub(ref_kmer, 3, 3)) |>
    summarise(n = n(), .groups = 'drop')

  base_counts |>
    mutate(percent = 100 * n / sum(n))
}
# Prefix `path` with the global project root `wd`.
#
# Args:
#   path: character vector of paths relative to `wd`.
#
# Returns: `wd` and `path` concatenated without a separator, so `wd` is
#   expected to carry its own trailing slash.
paste_wd <- function(path) paste0(wd, path)
Read data
# Load the joined sample-comparison table from its gzipped TSV file under the
# project root.
sampcomp_results_joined <- read_tsv(
  paste_wd('Tables/DRS/Positions/sampcomp_results_joined_2024-04-09.tsv.gz')
)
## Rows: 5884004 Columns: 67
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (34): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (33): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the loaded table to preview its dimensions and leading columns.
sampcomp_results_joined
## # A tibble: 5,884,004 × 67
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000264926.7 RAD18-201 1464 TCACA NA
## 2 ENST00000264926.7 RAD18-201 1465 CACAT 1
## 3 ENST00000264926.7 RAD18-201 1466 ACATA NA
## 4 ENST00000264926.7 RAD18-201 1467 CATAA 1
## 5 ENST00000264926.7 RAD18-201 1468 ATAAA NA
## 6 ENST00000264926.7 RAD18-201 1473 AACGA 1
## 7 ENST00000264926.7 RAD18-201 1475 CGATC NA
## 8 ENST00000264926.7 RAD18-201 1486 ACACA NA
## 9 ENST00000264926.7 RAD18-201 1501 CAAGA 1
## 10 ENST00000264926.7 RAD18-201 1502 AAGAC NA
## # ℹ 5,883,994 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
Check previous results
# Thresholds to check: p < 0.05 in both KS intensity tests and any positive
# increase of the median intensity.
pval_thresh <- .05
dif_med_intensity <- 0

# Reuse the shared filter helper instead of repeating its conditions inline,
# so this check cannot drift from the logic used in the threshold sweep.
filter_KS_intensity_increase_pval_diff_median(
  pval = pval_thresh,
  diff_med = dif_med_intensity
)
## # A tibble: 605 × 67
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 422 GCCCA 1
## 2 ENST00000647248.2 RPL35A-211 380 ACCCC 1
## 3 ENST00000647248.2 RPL35A-211 381 CCCCT 1
## 4 ENST00000389680.2 MT-RNR1-201 43 ACACA 1
## 5 ENST00000389680.2 MT-RNR1-201 57 CCCCG 1
## 6 ENST00000389680.2 MT-RNR1-201 71 GTTCA 1
## 7 ENST00000389680.2 MT-RNR1-201 73 TCACC 1
## 8 ENST00000389680.2 MT-RNR1-201 75 ACCCT 0.777
## 9 ENST00000389680.2 MT-RNR1-201 93 ATCAA 1
## 10 ENST00000389680.2 MT-RNR1-201 138 GCTTA 1
## # ℹ 595 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
Try different threshold combinations
# Every p-value cut-off crossed with every median-difference cut-off
# (4 x 9 = 36 threshold combinations).
params <- expand_grid(
  pval = c(0.001, 0.01, 0.05, 0.1),
  diff_med = c(0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 1)
)
params
## # A tibble: 36 × 2
## pval diff_med
## <dbl> <dbl>
## 1 0.001 0
## 2 0.001 0.01
## 3 0.001 0.02
## 4 0.001 0.03
## 5 0.001 0.04
## 6 0.001 0.05
## 7 0.001 0.1
## 8 0.001 0.2
## 9 0.001 1
## 10 0.01 0
## # ℹ 26 more rows
# For each (pval, diff_med) pair in `params`: filter the sites, tabulate their
# middle bases, and prepend the two thresholds that produced the rows. The
# per-pair tables are row-bound into a single long table.
midbase_percent_in_different_threshold <-
  pmap_dfr(params, function(pval, diff_med) {
    filter_KS_intensity_increase_pval_diff_median(
      pval = pval, diff_med = diff_med
    ) |>
      calc_midC_percentage() |>
      mutate(pval = pval, diff_med = diff_med, .before = 1)
  })
# NOTE(review): the exported file name matches the variable name, so
# export_tsv() presumably derives it from the piped symbol - keep this call
# form unchanged unless the helper is checked.
midbase_percent_in_different_threshold |>
export_tsv(outdir = tabledir)
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_diffthresh/midbase_percent_in_different_threshold_2025-07-15.tsv
## # A tibble: 144 × 5
## pval diff_med mid_base n percent
## <dbl> <dbl> <chr> <int> <dbl>
## 1 0.001 0 A 26 12.6
## 2 0.001 0 C 159 77.2
## 3 0.001 0 G 2 0.971
## 4 0.001 0 T 19 9.22
## 5 0.001 0.01 A 26 12.6
## 6 0.001 0.01 C 159 77.2
## 7 0.001 0.01 G 2 0.971
## 8 0.001 0.01 T 19 9.22
## 9 0.001 0.02 A 26 12.6
## 10 0.001 0.02 C 159 77.2
## # ℹ 134 more rows
Plot
# Heatmap of the percentage of sites with a middle C, one tile per
# combination of p-value cut-off (x) and median-difference cut-off (y).
# Each tile is annotated with its percentage, formatted to 0.1 accuracy.
midbase_percent_in_different_threshold_heatmap <-
  midbase_percent_in_different_threshold |>
  filter(mid_base == 'C') |>
  ggplot(
    aes(
      x = as_factor(pval),
      y = as_factor(diff_med),
      fill = percent,
      label = scales::label_number(accuracy = .1)(percent)
    )
  ) +
  geom_tile() +
  geom_text()
# NOTE(review): ggsave_pdf() is a project helper; presumably it names the file
# after the piped variable, so the call form is kept as is.
midbase_percent_in_different_threshold_heatmap |>
ggsave_pdf(outdir = figdir, width = 8, height = 8)

# Horizontal bar chart of the number of sites with a middle C for every
# threshold combination.
#
# The axis labels are built as plotmath strings and parsed by
# scales::label_parse(), so the Greek delta is drawn as a plotmath symbol
# (`Delta`). A literal Unicode delta in the label text could not be converted
# by the PDF device ("conversion failure ... in 'mbcsToSbcs'" warnings).
# paste0() also avoids the stray spaces that paste()'s default separator put
# before the comma and after the comparison signs.
midbase_percent_in_different_threshold_num_barplot <-
  midbase_percent_in_different_threshold |>
  filter(mid_base == 'C') |>
  ggplot(aes(
    # Yields e.g. paste("p < 0.001, ", Delta, "median > 0")
    x = paste0('paste("p < ', pval, ', ", Delta, "median > ', diff_med, '")'),
    y = n
  )) +
  labs(x = '') +
  geom_bar(stat = 'identity') +
  scale_x_discrete(labels = scales::label_parse()) +
  coord_flip()
midbase_percent_in_different_threshold_num_barplot |>
ggsave_pdf(outdir = figdir, width = 8, height = 8)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.01' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.02' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.03' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.04' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.05' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.1' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.2' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 1' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.01' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.02' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.03' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.04' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.05' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.1' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.2' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 1' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.01' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.02' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.03' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.04' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.05' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.1' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.2' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 1' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.01' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.02' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.03' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.04' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.05' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.1' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.2' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 1' in 'mbcsToSbcs': for Δ (U+0394)

Revise supplementary figure
# Stacked horizontal bars of the middle-base composition (percent) for each
# p-value cut-off at diff_med == 0. All observed middle bases are kept (no
# mid_base == 'C' filter), so each bar stacks to 100 %.
#
# The axis labels are plotmath strings parsed by scales::label_parse(), so the
# Greek delta is drawn as a plotmath symbol (`Delta`). A literal Unicode delta
# could not be converted by the PDF device ("conversion failure ... in
# 'mbcsToSbcs'" warnings). The axis title is blanked because it would
# otherwise show the deparsed label-building expression.
#
# NOTE(review): the fill colours are unnamed, so they are assigned by position
# to the levels produced by reorder(mid_base, percent); the base-to-colour
# mapping therefore depends on the data. Consider a named vector if specific
# colours per base are intended.
midbase_percent_in_different_threshold_Cpercent_barplot <-
  midbase_percent_in_different_threshold |>
  filter(diff_med == 0) |>
  ggplot(aes(
    # Yields e.g. paste("p < 0.001, ", Delta, "median > 0")
    x = paste0('paste("p < ', pval, ', ", Delta, "median > ', diff_med, '")'),
    y = percent,
    fill = reorder(mid_base, percent)
  )) +
  labs(x = '') +
  geom_bar(stat = 'identity', position = position_stack()) +
  scale_x_discrete(labels = scales::label_parse()) +
  scale_fill_manual(values = c('#01C001', '#E6E602', '#5051FF', '#E00800')) +
  coord_flip()
midbase_percent_in_different_threshold_Cpercent_barplot |>
ggsave_pdf(outdir = figdir, width = 8, height = 4)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'paste("p < ", pval, ", Δmedian > ", diff_med)' in
## 'mbcsToSbcs': for Δ (U+0394)

# Stacked horizontal bars of the number of sites per middle base for each
# p-value cut-off at diff_med == 0. All observed middle bases are kept (no
# mid_base == 'C' filter).
#
# The axis labels are plotmath strings parsed by scales::label_parse(), so the
# Greek delta is drawn as a plotmath symbol (`Delta`). A literal Unicode delta
# could not be converted by the PDF device ("conversion failure ... in
# 'mbcsToSbcs'" warnings). The axis title is blanked because it would
# otherwise show the deparsed label-building expression.
#
# NOTE(review): the fill colours are unnamed, so they are assigned by position
# to the levels produced by reorder(mid_base, percent); the base-to-colour
# mapping therefore depends on the data.
midbase_percent_in_different_threshold_Cpercent_barplot_num <-
  midbase_percent_in_different_threshold |>
  filter(diff_med == 0) |>
  ggplot(aes(
    # Yields e.g. paste("p < 0.001, ", Delta, "median > 0")
    x = paste0('paste("p < ', pval, ', ", Delta, "median > ', diff_med, '")'),
    y = n,
    fill = reorder(mid_base, percent)
  )) +
  labs(x = '') +
  geom_bar(stat = 'identity') +
  scale_x_discrete(labels = scales::label_parse()) +
  scale_fill_manual(values = c('#01C001', '#E6E602', '#5051FF', '#E00800')) +
  coord_flip()
midbase_percent_in_different_threshold_Cpercent_barplot_num |>
ggsave_pdf(outdir = figdir, width = 8, height = 4)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'paste("p < ", pval, ", Δmedian > ", diff_med)' in
## 'mbcsToSbcs': for Δ (U+0394)

# Wide summary at diff_med == 0: one row per p-value cut-off, with one count
# column (n_<base>) and one percentage column (percent_<base>) per middle base.
filter(midbase_percent_in_different_threshold, diff_med == 0) |>
  pivot_wider(
    id_cols = c(pval, diff_med),
    names_from = mid_base,
    values_from = c(n, percent)
  )
## # A tibble: 4 × 10
## pval diff_med n_A n_C n_G n_T percent_A percent_C percent_G percent_T
## <dbl> <dbl> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 0.001 0 26 159 2 19 12.6 77.2 0.971 9.22
## 2 0.01 0 48 295 2 26 12.9 79.5 0.539 7.01
## 3 0.05 0 68 489 8 40 11.2 80.8 1.32 6.61
## 4 0.1 0 92 605 10 61 12.0 78.8 1.30 7.94
Export FASTA
# Export the k-mers of sites passing the intensity-increase filter as FASTA.
#
# Sites are selected with the given p-value cut-off and a median-difference
# cut-off of 0. Each record is named "<transcript_id>|<position>" and carries
# the site's `ref_kmer` as its sequence.
#
# Args:
#   pval_thresh: p-value cut-off; it is also appended to the output base name.
#
# Returns: whatever export_as_fasta() returns (project helper, not visible
#   here). The output directory is relative, so the file location depends on
#   the current working directory.
export_increased_intensity_sites_as_fasta <- function(pval_thresh) {
  fasta_dir <- 'Fasta/DRS_diffthresh/'
  fasta_basename <- paste0(
    'sites_with_increased_current_intensity_pval_', pval_thresh
  )

  filter_KS_intensity_increase_pval_diff_median(
    pval = pval_thresh, diff_med = 0
  ) |>
    mutate(name = paste(transcript_id, position, sep = '|')) |>
    select(name, ref_kmer) |>
    export_as_fasta(
      name = name, sequence = ref_kmer,
      outdir = fasta_dir, fasta_basename = fasta_basename, compression = ''
    )
}
# P-value cut-offs for which a FASTA file is written. Assigned with `<-` and
# before the export, rather than through walk()'s invisible return value.
pval <- c(.001, .01, .05, .1)
# walk() is called only for the export side effect of each cut-off.
walk(pval, export_increased_intensity_sites_as_fasta)
##
## Exported to: Fasta/DRS_diffthresh/sites_with_increased_current_intensity_pval_0.001.fa
##
##
## Exported to: Fasta/DRS_diffthresh/sites_with_increased_current_intensity_pval_0.01.fa
##
##
## Exported to: Fasta/DRS_diffthresh/sites_with_increased_current_intensity_pval_0.05.fa
##
##
## Exported to: Fasta/DRS_diffthresh/sites_with_increased_current_intensity_pval_0.1.fa